library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)
library(glmnet)
library(randomForestSRC)
library(caret)
library(ggRandomForests)AAQoL machine learning analysis with unbalanced random forest
Data set
This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.
Input data set
qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |>
mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
`English Speaking`=relevel(`English Speaking`,ref="Not at all"),
Ethnicity = relevel(Ethnicity,ref="Chinese")) |>
mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
"$10,000 - $19,999" ~"Below",
"$20,000 - $29,999"~"Below",
"$30,000 - $39,999"~"Below",
"$40,000 - $49,999"~"Below",
"$50,000 - $59,999"~"Below",
"$60,000 - $69,999"~"Above",
"$70,000 and over"~"Above",
.default=Income)) |>
mutate(Income_median = factor(Income_median, levels=c("Below","Above")))New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
Demographics
ps(Ethnicity)# A tibble: 7 × 3
Ethnicity n pct
<fct> <int> <dbl>
1 Chinese 639 24.5
2 Asian Indian 574 22.0
3 Filipino 265 10.2
4 Korean 471 18.1
5 Other 144 5.52
6 Vietnamese 514 19.7
7 <NA> 2 0.0767
ps(Gender)# A tibble: 3 × 3
Gender n pct
<fct> <int> <dbl>
1 Female 1425 54.6
2 Male 1157 44.3
3 <NA> 27 1.03
ps(Religion)# A tibble: 8 × 3
Religion n pct
<fct> <int> <dbl>
1 Buddhist 350 13.4
2 Catholic 492 18.9
3 Hindu 479 18.4
4 Muslim 68 2.61
5 None 506 19.4
6 Other 47 1.80
7 Protestant 645 24.7
8 <NA> 22 0.843
ps(`Full Time Employment`)# A tibble: 3 × 3
`Full Time Employment` n pct
<fct> <int> <dbl>
1 0 1458 55.9
2 Employed full time 1144 43.8
3 <NA> 7 0.268
ps(Income)# A tibble: 9 × 3
Income n pct
<fct> <int> <dbl>
1 $0 - $9,999 254 9.74
2 $10,000 - $19,999 205 7.86
3 $20,000 - $29,999 198 7.59
4 $30,000 - $39,999 207 7.93
5 $40,000 - $49,999 181 6.94
6 $50,000 - $59,999 178 6.82
7 $60,000 - $69,999 190 7.28
8 $70,000 and over 993 38.1
9 <NA> 203 7.78
ps(`English Speaking`)# A tibble: 5 × 3
`English Speaking` n pct
<fct> <int> <dbl>
1 Not at all 177 6.78
2 Not well 632 24.2
3 Very well 974 37.3
4 Well 808 31.0
5 <NA> 18 0.690
ps(`English Difficulties`)# A tibble: 5 × 3
`English Difficulties` n pct
<fct> <int> <dbl>
1 Not at all 772 29.6
2 Much 549 21.0
3 Not much 733 28.1
4 Very much 516 19.8
5 <NA> 39 1.49
ps(Discrimination)# A tibble: 3 × 3
Discrimination n pct
<dbl> <int> <dbl>
1 0 1598 61.2
2 1 694 26.6
3 NA 317 12.2
qol |> summarize(age_mean = mean(Age,na.rm=T),
age_sd = sd(Age,na.rm=T),
age_min = min(Age,na.rm=T),
age_max = max(Age,na.rm=T))# A tibble: 1 × 4
age_mean age_sd age_min age_max
<dbl> <dbl> <dbl> <dbl>
1 42.9 17.1 18 98
Source of Information: Family
ps(Family)# A tibble: 4 × 3
Family n pct
<fct> <int> <dbl>
1 3 1 0.0383
2 No 1258 48.2
3 Yes 1331 51.0
4 <NA> 19 0.728
rfdata <- qol |> filter(Family %in% c("No","Yes")) |>
mutate(Family=droplevels(Family)) |>
select(Family, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
# filter(!is.na(Family)) |>
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Family ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="auc",method="brf")
print(rfobj) Sample size: 1926
Frequency of class labels: 928, 998
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 528.373
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swr
Resample size used to grow trees: 1856
Analysis: RF-C
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0754
(OOB) Brier score: 0.23115124
(OOB) Normalized Brier score: 0.92460495
(OOB) AUC: 0.65014728
(OOB) Log-loss: 0.65340355
(OOB) PR-AUC: 0.61312168
(OOB) G-mean: 0.59612122
(OOB) Requested performance error: 0.40387878
Confusion matrix:
predicted
observed No Yes class.error
No 593 335 0.3610
Yes 443 555 0.4439
(OOB) Misclassification rate: 0.403946
print(rfobj) Sample size: 1926
Frequency of class labels: 928, 998
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 528.373
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swr
Resample size used to grow trees: 1856
Analysis: RF-C
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0754
(OOB) Brier score: 0.23115124
(OOB) Normalized Brier score: 0.92460495
(OOB) AUC: 0.65014728
(OOB) Log-loss: 0.65340355
(OOB) PR-AUC: 0.61312168
(OOB) G-mean: 0.59612122
(OOB) Requested performance error: 0.40387878
Confusion matrix:
predicted
observed No Yes class.error
No 593 335 0.3610
Yes 443 555 0.4439
(OOB) Misclassification rate: 0.403946
plot(rfobj,plots.one.page = FALSE)

all No Yes
Age 0.0126 NA NA
Ethnicity 0.0070 NA NA
EnglishDiff 0.0048 NA NA
Family.Respect 0.0045 NA NA
Similar.Values 0.0019 NA NA
Dental.Insurance 0.0015 NA NA
Loyalty 0.0007 NA NA
Gender -0.0015 NA NA
Spend.Time.Together -0.0017 NA NA
Successful.Family -0.0017 NA NA
EnglishSpeak -0.0021 NA NA
Togetherness -0.0022 NA NA
Feel.Close -0.0024 NA NA
Discrimination -0.0031 NA NA
Family.Pride -0.0031 NA NA
Get.Along -0.0037 NA NA
Close.knit.Community -0.0043 NA NA
Helpful.Family -0.0045 NA NA
See.Friends -0.0046 NA NA
Health.Insurance -0.0052 NA NA
Trust -0.0058 NA NA
Income_median -0.0058 NA NA
Community.Shares.Values -0.0062 NA NA
Religious.Attendance -0.0065 NA NA
Employment -0.0065 NA NA
Helpful.Community -0.0067 NA NA
rfobj$importance all No Yes
Ethnicity 0.0069551564 NA NA
Age 0.0125900668 NA NA
Gender -0.0014680505 NA NA
Religion -0.0084388205 NA NA
Employment -0.0064719112 NA NA
Income_median -0.0058265061 NA NA
EnglishSpeak -0.0021443278 NA NA
EnglishDiff 0.0048111707 NA NA
See.Family -0.0100360757 NA NA
Close.Family -0.0075970273 NA NA
Helpful.Family -0.0044823810 NA NA
See.Friends -0.0045875412 NA NA
Close.Friends -0.0121965006 NA NA
Helpful.Friends -0.0142166122 NA NA
Family.Respect 0.0044917030 NA NA
Similar.Values 0.0019048933 NA NA
Successful.Family -0.0016974746 NA NA
Trust -0.0057673093 NA NA
Loyalty 0.0007367428 NA NA
Family.Pride -0.0031190247 NA NA
Expression -0.0080815145 NA NA
Spend.Time.Together -0.0016685756 NA NA
Feel.Close -0.0024267114 NA NA
Togetherness -0.0021741062 NA NA
Religious.Attendance -0.0064522008 NA NA
Religious.Importance -0.0089717074 NA NA
Close.knit.Community -0.0043313496 NA NA
Helpful.Community -0.0066671893 NA NA
Community.Shares.Values -0.0062380318 NA NA
Get.Along -0.0037187432 NA NA
Community.Trust -0.0093222463 NA NA
Health.Insurance -0.0052315829 NA NA
Dental.Insurance 0.0014789379 NA NA
Discrimination -0.0030703725 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Cross validation in Random Forests (Run when you have time)
# myTrainingControl <- trainControl(method = "repeatedcv",
# number = 10,
# repeats = 3,
# savePredictions = TRUE,
# classProbs = TRUE,
# verboseIter = TRUE,
# search = "grid")
#
#
# set.seed(123)
#
# model_rf <- train(Family~ .,
# data=rfdata,
# method = 'rf',
# metric = "Accuracy",
# trControl = myTrainingControl,
# importance = TRUE
# )
#
# varImp(model_rf)Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- caret::createDataPartition(rfdata$Family,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Family~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
#
# rfsrc(Family~.,data=train, importance="permute",
# perf.type="gmean",
# splitrule="auc",
# block.size = 10) ->rfobj
rfobj <- imbalanced(Family ~ .,importance=T,data=train,
perf.type = "gmean",splitrule="auc")
print(rfobj) Sample size: 1542
Frequency of class labels: 756, 786
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 335.6223
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 975
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0397
(OOB) Brier score: 0.18671405
(OOB) Normalized Brier score: 0.74685618
(OOB) AUC: 0.85810631
(OOB) Log-loss: 0.5603946
(OOB) PR-AUC: 0.85597293
(OOB) G-mean: 0.77922288
(OOB) Requested performance error: 0.22077712
Confusion matrix:
predicted
observed Yes No class.error
Yes 581 175 0.2315
No 165 621 0.2099
(OOB) Misclassification rate: 0.2204929
plot(rfobj,plots.one.page = FALSE)

all Yes No
Ethnicity 0.0304 NA NA
Discrimination 0.0243 NA NA
Religion 0.0228 NA NA
EnglishDiff 0.0207 NA NA
Close.Family 0.0202 NA NA
Religious.Attendance 0.0188 NA NA
Age 0.0185 NA NA
Get.Along 0.0160 NA NA
Spend.Time.Together 0.0155 NA NA
Income_median 0.0155 NA NA
See.Family 0.0154 NA NA
Gender 0.0138 NA NA
Religious.Importance 0.0137 NA NA
Employment 0.0135 NA NA
See.Friends 0.0130 NA NA
EnglishSpeak 0.0128 NA NA
Successful.Family 0.0127 NA NA
Similar.Values 0.0122 NA NA
Close.Friends 0.0117 NA NA
Togetherness 0.0113 NA NA
Trust 0.0104 NA NA
Close.knit.Community 0.0095 NA NA
Helpful.Community 0.0091 NA NA
Feel.Close 0.0087 NA NA
Expression 0.0078 NA NA
Dental.Insurance 0.0077 NA NA
rfobj$importance all Yes No
Ethnicity 3.043835e-02 NA NA
Age 1.847720e-02 NA NA
Gender 1.379629e-02 NA NA
Religion 2.283324e-02 NA NA
Employment 1.352591e-02 NA NA
Income_median 1.545365e-02 NA NA
EnglishSpeak 1.275595e-02 NA NA
EnglishDiff 2.072475e-02 NA NA
See.Family 1.538315e-02 NA NA
Close.Family 2.018801e-02 NA NA
Helpful.Family 6.238437e-03 NA NA
See.Friends 1.302496e-02 NA NA
Close.Friends 1.168193e-02 NA NA
Helpful.Friends 6.299399e-03 NA NA
Family.Respect 5.191944e-03 NA NA
Similar.Values 1.219078e-02 NA NA
Successful.Family 1.272191e-02 NA NA
Trust 1.042880e-02 NA NA
Loyalty 3.143308e-03 NA NA
Family.Pride 3.182341e-03 NA NA
Expression 7.832653e-03 NA NA
Spend.Time.Together 1.553298e-02 NA NA
Feel.Close 8.655568e-03 NA NA
Togetherness 1.130709e-02 NA NA
Religious.Attendance 1.884340e-02 NA NA
Religious.Importance 1.374572e-02 NA NA
Close.knit.Community 9.496852e-03 NA NA
Helpful.Community 9.130726e-03 NA NA
Community.Shares.Values 6.267829e-03 NA NA
Get.Along 1.601573e-02 NA NA
Community.Trust 5.738946e-03 NA NA
Health.Insurance -4.211306e-05 NA NA
Dental.Insurance 7.745393e-03 NA NA
Discrimination 2.429195e-02 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
ggsave(filename="family_train_VIMP.png",width=5,height=5,units="in")Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance = T,outcome="test")
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
199.0000000 185.0000000 1.0756757 0.4817708 0.5351351 0.5226131
prec npv misclass brier brier.norm auc
0.5103093 0.5473684 0.4713542 0.2451071 0.9804283 0.5738693
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.6832185 0.5224274 0.5284946 0.4817708 0.5537617 0.5256322
F1modgmean gmean
0.5286658 0.5288370
test_rf$importance all Yes No
Ethnicity 7.773687e-03 NA NA
Age 1.391193e-02 NA NA
Gender 1.481661e-03 NA NA
Religion -6.770110e-05 NA NA
Employment 3.369657e-04 NA NA
Income_median 1.341765e-04 NA NA
EnglishSpeak 1.737331e-03 NA NA
EnglishDiff 1.686164e-04 NA NA
See.Family 3.991785e-04 NA NA
Close.Family -3.209117e-03 NA NA
Helpful.Family 1.130363e-03 NA NA
See.Friends 1.588099e-03 NA NA
Close.Friends 1.389035e-03 NA NA
Helpful.Friends -1.563947e-03 NA NA
Family.Respect -7.517450e-04 NA NA
Similar.Values -3.947004e-04 NA NA
Successful.Family 1.621815e-03 NA NA
Trust -9.002110e-04 NA NA
Loyalty -1.147870e-04 NA NA
Family.Pride 1.106295e-03 NA NA
Expression -1.092909e-03 NA NA
Spend.Time.Together -8.849227e-04 NA NA
Feel.Close -3.067122e-05 NA NA
Togetherness 5.981813e-05 NA NA
Religious.Attendance 5.583681e-04 NA NA
Religious.Importance -5.044237e-04 NA NA
Close.knit.Community -4.833768e-04 NA NA
Helpful.Community -8.145869e-04 NA NA
Community.Shares.Values -2.730269e-04 NA NA
Get.Along 1.300196e-03 NA NA
Community.Trust -5.415171e-04 NA NA
Health.Insurance 1.750978e-04 NA NA
Dental.Insurance 1.569785e-03 NA NA
Discrimination 8.925712e-04 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
ggsave(filename="family_test_VIMP.png",width=5,height=5,units="in")Source of Information: Health Professionals
ps(`Heal Professionals`)# A tibble: 3 × 3
`Heal Professionals` n pct
<fct> <int> <dbl>
1 No 1326 50.8
2 Yes 1264 48.4
3 <NA> 19 0.728
rfdata <- qol |>
select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imbalanced(Heal.Professionals ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="auc")->rfobj
print(rfobj) Sample size: 1927
Frequency of class labels: 925, 1002
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 529.8617
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 1218
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0832
(OOB) Brier score: 0.22739259
(OOB) Normalized Brier score: 0.90957038
(OOB) AUC: 0.67375951
(OOB) Log-loss: 0.64656947
(OOB) PR-AUC: 0.63055041
(OOB) G-mean: 0.62051216
(OOB) Requested performance error: 0.37948784
Confusion matrix:
predicted
observed No Yes class.error
No 562 363 0.3924
Yes 367 635 0.3663
(OOB) Misclassification rate: 0.3788272
plot(rfobj,plots.one.page = FALSE)

all No Yes
EnglishSpeak 0.0087 NA NA
Get.Along 0.0077 NA NA
Community.Shares.Values 0.0076 NA NA
Spend.Time.Together 0.0068 NA NA
Expression 0.0065 NA NA
Gender 0.0064 NA NA
Age 0.0056 NA NA
Similar.Values 0.0049 NA NA
Health.Insurance 0.0049 NA NA
Feel.Close 0.0048 NA NA
Community.Trust 0.0045 NA NA
Family.Pride 0.0045 NA NA
Income_median 0.0041 NA NA
Discrimination 0.0039 NA NA
Dental.Insurance 0.0034 NA NA
Helpful.Community 0.0031 NA NA
Family.Respect 0.0030 NA NA
Trust 0.0026 NA NA
Close.knit.Community 0.0012 NA NA
Loyalty 0.0006 NA NA
Religious.Importance -0.0008 NA NA
Togetherness -0.0008 NA NA
Ethnicity -0.0018 NA NA
Religious.Attendance -0.0021 NA NA
Religion -0.0023 NA NA
Helpful.Family -0.0023 NA NA
rfobj$importance all No Yes
Ethnicity -0.0018143542 NA NA
Age 0.0056014941 NA NA
Gender 0.0063757609 NA NA
Religion -0.0022995991 NA NA
Employment -0.0024295113 NA NA
Income_median 0.0041083863 NA NA
EnglishSpeak 0.0086779884 NA NA
EnglishDiff -0.0028554330 NA NA
See.Family -0.0040666801 NA NA
Close.Family -0.0026329858 NA NA
Helpful.Family -0.0023203869 NA NA
See.Friends -0.0059329832 NA NA
Close.Friends -0.0071864210 NA NA
Helpful.Friends -0.0065896944 NA NA
Family.Respect 0.0029979134 NA NA
Similar.Values 0.0049175798 NA NA
Successful.Family -0.0029246611 NA NA
Trust 0.0026337288 NA NA
Loyalty 0.0005523029 NA NA
Family.Pride 0.0045040927 NA NA
Expression 0.0065145648 NA NA
Spend.Time.Together 0.0067966620 NA NA
Feel.Close 0.0048483539 NA NA
Togetherness -0.0008427275 NA NA
Religious.Attendance -0.0020812857 NA NA
Religious.Importance -0.0007880284 NA NA
Close.knit.Community 0.0012392363 NA NA
Helpful.Community 0.0030590686 NA NA
Community.Shares.Values 0.0075557164 NA NA
Get.Along 0.0077018301 NA NA
Community.Trust 0.0045040927 NA NA
Health.Insurance 0.0049053114 NA NA
Dental.Insurance 0.0033728024 NA NA
Discrimination 0.0039211271 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
pos<- rfdata |> filter(Heal.Professionals=="Yes")
neg <- rfdata |> filter(Heal.Professionals==0)
set.seed(222)
imbal_index <- createDataPartition(rfdata$Heal.Professionals,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Heal.Professionals~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Heal.Professionals ~ .,importance=T,data=train,
perf.type = "gmean",splitrule="auc")
print(rfobj) Sample size: 1542
Frequency of class labels: 756, 786
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 323.5823
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 975
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0397
(OOB) Brier score: 0.17475468
(OOB) Normalized Brier score: 0.6990187
(OOB) AUC: 0.86249361
(OOB) Log-loss: 0.53255883
(OOB) PR-AUC: 0.8529244
(OOB) G-mean: 0.78548997
(OOB) Requested performance error: 0.21451003
Confusion matrix:
predicted
observed Yes No class.error
Yes 604 152 0.2011
No 179 607 0.2277
(OOB) Misclassification rate: 0.2146563
print(rfobj) Sample size: 1542
Frequency of class labels: 756, 786
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 323.5823
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 975
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0397
(OOB) Brier score: 0.17475468
(OOB) Normalized Brier score: 0.6990187
(OOB) AUC: 0.86249361
(OOB) Log-loss: 0.53255883
(OOB) PR-AUC: 0.8529244
(OOB) G-mean: 0.78548997
(OOB) Requested performance error: 0.21451003
Confusion matrix:
predicted
observed Yes No class.error
Yes 604 152 0.2011
No 179 607 0.2277
(OOB) Misclassification rate: 0.2146563
plot(rfobj,plots.one.page = FALSE)

all Yes No
EnglishSpeak 0.0376 NA NA
Religion 0.0267 NA NA
EnglishDiff 0.0248 NA NA
Religious.Importance 0.0247 NA NA
Ethnicity 0.0242 NA NA
See.Friends 0.0188 NA NA
Age 0.0156 NA NA
Discrimination 0.0143 NA NA
Religious.Attendance 0.0143 NA NA
Dental.Insurance 0.0143 NA NA
Close.knit.Community 0.0130 NA NA
Helpful.Community 0.0117 NA NA
Get.Along 0.0117 NA NA
Helpful.Family 0.0111 NA NA
See.Family 0.0110 NA NA
Close.Family 0.0098 NA NA
Community.Shares.Values 0.0091 NA NA
Family.Respect 0.0091 NA NA
Close.Friends 0.0078 NA NA
Feel.Close 0.0071 NA NA
Gender 0.0071 NA NA
Community.Trust 0.0065 NA NA
Togetherness 0.0065 NA NA
Income_median 0.0058 NA NA
Employment 0.0058 NA NA
Helpful.Friends 0.0052 NA NA
rfobj$importance all Yes No
Ethnicity 0.024183711 NA NA
Age 0.015586870 NA NA
Gender 0.007136857 NA NA
Religion 0.026725642 NA NA
Employment 0.005836350 NA NA
Income_median 0.005844984 NA NA
EnglishSpeak 0.037625231 NA NA
EnglishDiff 0.024767514 NA NA
See.Family 0.011034099 NA NA
Close.Family 0.009780208 NA NA
Helpful.Family 0.011077560 NA NA
See.Friends 0.018808992 NA NA
Close.Friends 0.007781435 NA NA
Helpful.Friends 0.005219260 NA NA
Family.Respect 0.009078697 NA NA
Similar.Values 0.004539086 NA NA
Successful.Family 0.003891799 NA NA
Trust 0.004539086 NA NA
Loyalty 0.005193379 NA NA
Family.Pride 0.004542318 NA NA
Expression 0.003889646 NA NA
Spend.Time.Together 0.005193379 NA NA
Feel.Close 0.007149830 NA NA
Togetherness 0.006484172 NA NA
Religious.Attendance 0.014274295 NA NA
Religious.Importance 0.024652487 NA NA
Close.knit.Community 0.012970486 NA NA
Helpful.Community 0.011679748 NA NA
Community.Shares.Values 0.009080865 NA NA
Get.Along 0.011673223 NA NA
Community.Trust 0.006484172 NA NA
Health.Insurance 0.001944557 NA NA
Dental.Insurance 0.014267749 NA NA
Discrimination 0.014274295 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
ggsave(filename="healthpro_train_VIMP.png",width=5,height=5,units="in")Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
200.0000000 185.0000000 1.0810811 0.4805195 0.6378378 0.6200000
prec npv misclass brier brier.norm auc
0.6082474 0.6492147 0.3714286 0.2345217 0.9380869 0.6493243
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.6617049 0.6226913 0.6284279 0.4805195 0.6120594 0.6257735
F1modgmean gmean
0.6286418 0.6288557
test_rf$importance all Yes No
Ethnicity 1.491325e-03 NA NA
Age 3.472339e-03 NA NA
Gender 4.084452e-06 NA NA
Religion 1.941187e-03 NA NA
Employment -3.785221e-07 NA NA
Income_median 1.264392e-03 NA NA
EnglishSpeak 1.904263e-02 NA NA
EnglishDiff 1.101844e-02 NA NA
See.Family 3.399809e-03 NA NA
Close.Family 1.585158e-03 NA NA
Helpful.Family 1.067564e-03 NA NA
See.Friends 2.320842e-03 NA NA
Close.Friends -7.445241e-04 NA NA
Helpful.Friends 1.773664e-03 NA NA
Family.Respect 1.169300e-03 NA NA
Similar.Values 4.038333e-04 NA NA
Successful.Family -3.801373e-04 NA NA
Trust -2.101630e-04 NA NA
Loyalty -3.523717e-04 NA NA
Family.Pride -2.651401e-04 NA NA
Expression 3.227736e-04 NA NA
Spend.Time.Together 3.551350e-04 NA NA
Feel.Close -2.444042e-04 NA NA
Togetherness -7.700216e-04 NA NA
Religious.Attendance 2.228277e-04 NA NA
Religious.Importance 1.200384e-03 NA NA
Close.knit.Community -2.856467e-04 NA NA
Helpful.Community 7.287934e-04 NA NA
Community.Shares.Values -4.119143e-04 NA NA
Get.Along 4.864171e-04 NA NA
Community.Trust 1.569440e-04 NA NA
Health.Insurance 1.849156e-03 NA NA
Dental.Insurance 5.658109e-03 NA NA
Discrimination 2.479901e-03 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
ggsave(filename="healthpro_test_VIMP.png",width=5,height=5,units="in")Health Insurance
ps(`Health Insurance`)# A tibble: 3 × 3
`Health Insurance` n pct
<fct> <int> <dbl>
1 0 381 14.6
2 Yes 2207 84.6
3 <NA> 21 0.805
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Health Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Health.Insurance ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="auc")
print(imb) Sample size: 1936
Frequency of class labels: 259, 1677
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 295.831
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 1224
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 6.4749
(OOB) Brier score: 0.10519154
(OOB) Normalized Brier score: 0.42076617
(OOB) AUC: 0.73382845
(OOB) Log-loss: 0.35223368
(OOB) PR-AUC: 0.3214195
(OOB) G-mean: 0.66878662
(OOB) Requested performance error: 0.33121338
Confusion matrix:
predicted
observed 0 Yes class.error
0 203 56 0.2162
Yes 720 957 0.4293
(OOB) Misclassification rate: 0.4008264
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1677.0000000 259.0000000 6.4749035 0.1337810 0.7837838 0.5706619
prec npv misclass brier brier.norm auc
0.2199350 0.9447187 0.4008264 0.1051915 0.4207662 0.7338284
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.3522337 0.3434856 0.4633100 0.1337810 0.3214195 0.5061361
F1modgmean gmean
0.5660483 0.6687866
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
# ind_pos <- sample(c(0,1), nrow(pos), replace = T, prob = c(0.7, 0.3))
# ind_neg <- sample(c(0,1), nrow(neg), replace = T, prob = c(0.7, 0.3))
#
#
# train <- bind_rows(pos[ind_pos==0,],neg[ind_neg==0,])
# test <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
imbal_index <- createDataPartition(rfdata$Health.Insurance,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Health.Insurance~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Health.Insurance ~ .,importance=T,data=train,
perf.type = "gmean",splitrule="auc")
print(rfobj) Sample size: 1550
Frequency of class labels: 760, 790
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 277.4057
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 980
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0395
(OOB) Brier score: 0.12398991
(OOB) Normalized Brier score: 0.49595964
(OOB) AUC: 0.96556213
(OOB) Log-loss: 0.4175614
(OOB) PR-AUC: 0.96399298
(OOB) G-mean: 0.89402301
(OOB) Requested performance error: 0.10597699
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 661 99 0.1303
0 64 726 0.0810
(OOB) Misclassification rate: 0.1051613
print(rfobj) Sample size: 1550
Frequency of class labels: 760, 790
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 277.4057
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 980
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0395
(OOB) Brier score: 0.12398991
(OOB) Normalized Brier score: 0.49595964
(OOB) AUC: 0.96556213
(OOB) Log-loss: 0.4175614
(OOB) PR-AUC: 0.96399298
(OOB) G-mean: 0.89402301
(OOB) Requested performance error: 0.10597699
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 661 99 0.1303
0 64 726 0.0810
(OOB) Misclassification rate: 0.1051613
plot(rfobj,plots.one.page = FALSE)

all Yes 0
EnglishSpeak 0.0218 NA NA
Religion 0.0201 NA NA
EnglishDiff 0.0176 NA NA
Ethnicity 0.0143 NA NA
Income_median 0.0132 NA NA
Community.Shares.Values 0.0129 NA NA
Helpful.Community 0.0118 NA NA
Helpful.Family 0.0115 NA NA
Close.Family 0.0108 NA NA
Employment 0.0104 NA NA
Get.Along 0.0101 NA NA
Religious.Importance 0.0094 NA NA
Religious.Attendance 0.0094 NA NA
Community.Trust 0.0094 NA NA
Successful.Family 0.0077 NA NA
Age 0.0076 NA NA
Close.knit.Community 0.0069 NA NA
Close.Friends 0.0065 NA NA
Similar.Values 0.0065 NA NA
Feel.Close 0.0050 NA NA
Helpful.Friends 0.0046 NA NA
Togetherness 0.0044 NA NA
Spend.Time.Together 0.0039 NA NA
See.Friends 0.0038 NA NA
Gender 0.0031 NA NA
Family.Pride 0.0030 NA NA
rfobj$importance all Yes 0
Ethnicity 0.014316200 NA NA
Age 0.007633590 NA NA
Gender 0.003083911 NA NA
Religion 0.020066035 NA NA
Employment 0.010442500 NA NA
Income_median 0.013158258 NA NA
EnglishSpeak 0.021816327 NA NA
EnglishDiff 0.017565438 NA NA
See.Family 0.002583976 NA NA
Close.Family 0.010779038 NA NA
Helpful.Family 0.011509116 NA NA
See.Friends 0.003816097 NA NA
Close.Friends 0.006521899 NA NA
Helpful.Friends 0.004615363 NA NA
Family.Respect 0.001353556 NA NA
Similar.Values 0.006459971 NA NA
Successful.Family 0.007691842 NA NA
Trust 0.002645635 NA NA
Loyalty 0.002645635 NA NA
Family.Pride 0.003028764 NA NA
Expression 0.001907875 NA NA
Spend.Time.Together 0.003937719 NA NA
Feel.Close 0.005049925 NA NA
Togetherness 0.004375695 NA NA
Religious.Attendance 0.009433654 NA NA
Religious.Importance 0.009433654 NA NA
Close.knit.Community 0.006850379 NA NA
Helpful.Community 0.011776207 NA NA
Community.Shares.Values 0.012920046 NA NA
Get.Along 0.010053332 NA NA
Community.Trust 0.009381877 NA NA
Discrimination 0.001482306 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
ggsave(filename="HIns_train_VIMP.png",width=5,height=5,units="in")Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
335.0000000 51.0000000 6.5686275 0.1321244 0.5686275 0.5522388
prec npv misclass brier brier.norm auc
0.1620112 0.8937198 0.4455959 0.1114667 0.4458667 0.6172666
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.3763007 0.2521739 0.3682982 0.1321244 0.2359474 0.4062736
F1modgmean gmean
0.4643357 0.5603732
test_rf$importance all Yes 0
Ethnicity -0.0017525858 NA NA
Age -0.0052636834 NA NA
Gender 0.0005795693 NA NA
Religion 0.0036039587 NA NA
Employment 0.0034489475 NA NA
Income_median 0.0270547814 NA NA
EnglishSpeak 0.0025917176 NA NA
EnglishDiff -0.0066131418 NA NA
See.Family 0.0054740860 NA NA
Close.Family -0.0044527376 NA NA
Helpful.Family 0.0031669773 NA NA
See.Friends -0.0011282253 NA NA
Close.Friends 0.0011439600 NA NA
Helpful.Friends 0.0047502384 NA NA
Family.Respect -0.0003789466 NA NA
Similar.Values -0.0005181470 NA NA
Successful.Family 0.0002480748 NA NA
Trust -0.0019574952 NA NA
Loyalty 0.0004630898 NA NA
Family.Pride -0.0003169345 NA NA
Expression 0.0007673952 NA NA
Spend.Time.Together 0.0004440892 NA NA
Feel.Close 0.0014248742 NA NA
Togetherness 0.0011873526 NA NA
Religious.Attendance 0.0024413037 NA NA
Religious.Importance 0.0042524577 NA NA
Close.knit.Community -0.0019943671 NA NA
Helpful.Community -0.0012136076 NA NA
Community.Shares.Values -0.0017010414 NA NA
Get.Along -0.0033894357 NA NA
Community.Trust -0.0025111518 NA NA
Discrimination -0.0018204015 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
ggsave(filename="HIns_test_VIMP.png",width=5,height=5,units="in")Dental Insurance
ps(`Dental Insurance`)# A tibble: 3 × 3
`Dental Insurance` n pct
<fct> <int> <dbl>
1 0 1050 40.2
2 Yes 1529 58.6
3 <NA> 30 1.15
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Dental Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Dental.Insurance ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="auc")
print(imb) Sample size: 1932
Frequency of class labels: 760, 1172
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 451.0523
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 1221
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.5421
(OOB) Brier score: 0.17933324
(OOB) Normalized Brier score: 0.71733296
(OOB) AUC: 0.79775799
(OOB) Log-loss: 0.53718287
(OOB) PR-AUC: 0.71577982
(OOB) G-mean: 0.72787095
(OOB) Requested performance error: 0.27212905
Confusion matrix:
predicted
observed 0 Yes class.error
0 572 188 0.2474
Yes 347 825 0.2961
(OOB) Misclassification rate: 0.2769151
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1172.0000000 760.0000000 1.5421053 0.3933747 0.7526316 0.7039249
prec npv misclass brier brier.norm auc
0.6224157 0.8144126 0.2769151 0.1793332 0.7173330 0.7977580
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.5371829 0.6813580 0.7163581 0.3933747 0.7157798 0.7046145
F1modgmean gmean
0.7221145 0.7278710
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Dental.Insurance,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Dental.Insurance~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Dental.Insurance ~ .,importance=T,data=train,
perf.type = "gmean",splitrule="auc")
print(rfobj) Sample size: 1546
Frequency of class labels: 757, 789
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 284.784
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 977
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0423
(OOB) Brier score: 0.13865716
(OOB) Normalized Brier score: 0.55462863
(OOB) AUC: 0.91423938
(OOB) Log-loss: 0.44785029
(OOB) PR-AUC: 0.90865796
(OOB) G-mean: 0.84719663
(OOB) Requested performance error: 0.15280337
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 624 133 0.1757
0 102 687 0.1293
(OOB) Misclassification rate: 0.1520052
print(rfobj) Sample size: 1546
Frequency of class labels: 757, 789
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 284.784
No. of variables tried at each split: 6
Total no. of variables: 32
Resampling used to grow trees: swor
Resample size used to grow trees: 977
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0423
(OOB) Brier score: 0.13865716
(OOB) Normalized Brier score: 0.55462863
(OOB) AUC: 0.91423938
(OOB) Log-loss: 0.44785029
(OOB) PR-AUC: 0.90865796
(OOB) G-mean: 0.84719663
(OOB) Requested performance error: 0.15280337
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 624 133 0.1757
0 102 687 0.1293
(OOB) Misclassification rate: 0.1520052
plot(rfobj,plots.one.page = FALSE)

all Yes 0
Income_median 0.0317 NA NA
EnglishSpeak 0.0308 NA NA
Employment 0.0242 NA NA
Age 0.0211 NA NA
EnglishDiff 0.0203 NA NA
Ethnicity 0.0186 NA NA
Religion 0.0155 NA NA
Helpful.Family 0.0135 NA NA
Discrimination 0.0104 NA NA
Religious.Attendance 0.0095 NA NA
See.Friends 0.0087 NA NA
Close.Friends 0.0085 NA NA
Close.knit.Community 0.0082 NA NA
See.Family 0.0077 NA NA
Helpful.Friends 0.0075 NA NA
Community.Shares.Values 0.0070 NA NA
Religious.Importance 0.0070 NA NA
Expression 0.0070 NA NA
Helpful.Community 0.0065 NA NA
Spend.Time.Together 0.0058 NA NA
Community.Trust 0.0057 NA NA
Get.Along 0.0052 NA NA
Close.Family 0.0043 NA NA
Feel.Close 0.0039 NA NA
Trust 0.0038 NA NA
Gender 0.0032 NA NA
rfobj$importance all Yes 0
Ethnicity 0.0186314687 NA NA
Age 0.0211470094 NA NA
Gender 0.0032076047 NA NA
Religion 0.0155455542 NA NA
Employment 0.0241664494 NA NA
Income_median 0.0317198196 NA NA
EnglishSpeak 0.0308302749 NA NA
EnglishDiff 0.0203154072 NA NA
See.Family 0.0077108636 NA NA
Close.Family 0.0043271910 NA NA
Helpful.Family 0.0135096856 NA NA
See.Friends 0.0087236360 NA NA
Close.Friends 0.0084521130 NA NA
Helpful.Friends 0.0074835320 NA NA
Family.Respect 0.0011747097 NA NA
Similar.Values 0.0024124893 NA NA
Successful.Family -0.0007970438 NA NA
Trust 0.0037652270 NA NA
Loyalty 0.0024699663 NA NA
Family.Pride 0.0024699663 NA NA
Expression 0.0069742534 NA NA
Spend.Time.Together 0.0057984113 NA NA
Feel.Close 0.0039498593 NA NA
Togetherness 0.0012954372 NA NA
Religious.Attendance 0.0095077529 NA NA
Religious.Importance 0.0070330389 NA NA
Close.knit.Community 0.0082126069 NA NA
Helpful.Community 0.0064772321 NA NA
Community.Shares.Values 0.0070330389 NA NA
Get.Along 0.0051817764 NA NA
Community.Trust 0.0057377223 NA NA
Discrimination 0.0104276531 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
ggsave(filename="DIns_train_VIMP.png",width=5,height=5,units="in")Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
234.0000000 152.0000000 1.5394737 0.3937824 0.8157895 0.6880342
prec npv misclass brier brier.norm auc
0.6294416 0.8518519 0.2616580 0.1838678 0.7354711 0.7966712
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.5503655 0.7106017 0.7350448 0.3937824 0.7006867 0.7298977
F1modgmean gmean
0.7421192 0.7491936
test_rf$importance all Yes 0
Ethnicity 6.955105e-03 NA NA
Age 1.049989e-02 NA NA
Gender 5.533942e-04 NA NA
Religion 4.172060e-04 NA NA
Employment 1.801953e-02 NA NA
Income_median 6.474625e-02 NA NA
EnglishSpeak 2.222726e-02 NA NA
EnglishDiff 6.401071e-03 NA NA
See.Family 1.218979e-03 NA NA
Close.Family 8.783462e-04 NA NA
Helpful.Family 3.347928e-03 NA NA
See.Friends 8.517619e-04 NA NA
Close.Friends 4.657271e-03 NA NA
Helpful.Friends 4.084200e-03 NA NA
Family.Respect 6.792115e-04 NA NA
Similar.Values 5.809173e-05 NA NA
Successful.Family 8.622585e-04 NA NA
Trust 1.698674e-03 NA NA
Loyalty -4.762300e-04 NA NA
Family.Pride 2.149872e-04 NA NA
Expression 1.251987e-03 NA NA
Spend.Time.Together 1.459530e-03 NA NA
Feel.Close 3.209875e-04 NA NA
Togetherness 4.915094e-04 NA NA
Religious.Attendance -2.660509e-04 NA NA
Religious.Importance 4.327772e-04 NA NA
Close.knit.Community 2.316384e-03 NA NA
Helpful.Community 8.348674e-04 NA NA
Community.Shares.Values 1.343737e-04 NA NA
Get.Along 1.595639e-03 NA NA
Community.Trust -7.213593e-04 NA NA
Discrimination -1.198574e-03 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
ggsave(filename="DIns_test_VIMP.png",width=5,height=5,units="in")Physical Checkup
ps(`Physical Check-up`)# A tibble: 3 × 3
`Physical Check-up` n pct
<fct> <int> <dbl>
1 0 833 31.9
2 Yes 1740 66.7
3 <NA> 36 1.38
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Physical Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Physical.Check.up ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="auc")
print(imb) Sample size: 1918
Frequency of class labels: 614, 1304
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 451.908
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 1212
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 2.1238
(OOB) Brier score: 0.18414117
(OOB) Normalized Brier score: 0.73656469
(OOB) AUC: 0.74369423
(OOB) Log-loss: 0.54902518
(OOB) PR-AUC: 0.55768871
(OOB) G-mean: 0.6964641
(OOB) Requested performance error: 0.3035359
Confusion matrix:
predicted
observed 0 Yes class.error
0 432 182 0.2964
Yes 405 899 0.3106
(OOB) Misclassification rate: 0.306048
plot(imb,plots.one.page = F)

all 0 Yes
Age 0.0451 NA NA
Health.Insurance 0.0367 NA NA
Dental.Insurance 0.0277 NA NA
Gender 0.0172 NA NA
Income_median 0.0074 NA NA
EnglishDiff 0.0061 NA NA
Community.Shares.Values 0.0055 NA NA
Employment 0.0055 NA NA
Discrimination 0.0045 NA NA
Togetherness 0.0043 NA NA
EnglishSpeak 0.0035 NA NA
Helpful.Family 0.0032 NA NA
Close.knit.Community 0.0029 NA NA
Religious.Importance 0.0023 NA NA
Close.Family 0.0019 NA NA
Get.Along 0.0019 NA NA
Religion 0.0016 NA NA
See.Family 0.0016 NA NA
Ethnicity 0.0012 NA NA
Loyalty 0.0008 NA NA
Family.Respect 0.0000 NA NA
Trust -0.0001 NA NA
Religious.Attendance -0.0005 NA NA
See.Friends -0.0008 NA NA
Community.Trust -0.0008 NA NA
Family.Pride -0.0015 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1304.0000000 614.0000000 2.1237785 0.3201251 0.7035831 0.6894172
prec npv misclass brier brier.norm auc
0.5161290 0.8316374 0.3060480 0.1841412 0.7365647 0.7436942
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.5490252 0.5954514 0.6653643 0.3201251 0.5576887 0.6459578
F1modgmean gmean
0.6809142 0.6964641
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Physical.Check.up,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Physical.Check.up~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Physical.Check.up~ .,importance=T,data=train,
perf.type = "gmean",splitrule="auc")
print(rfobj) Sample size: 1536
Frequency of class labels: 754, 782
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 297.9053
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 971
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0371
(OOB) Brier score: 0.14734902
(OOB) Normalized Brier score: 0.58939609
(OOB) AUC: 0.9177545
(OOB) Log-loss: 0.47059335
(OOB) PR-AUC: 0.91289228
(OOB) G-mean: 0.84443067
(OOB) Requested performance error: 0.15556933
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 659 95 0.1260
0 144 638 0.1841
(OOB) Misclassification rate: 0.155599
print(rfobj) Sample size: 1536
Frequency of class labels: 754, 782
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 297.9053
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 971
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0371
(OOB) Brier score: 0.14734902
(OOB) Normalized Brier score: 0.58939609
(OOB) AUC: 0.9177545
(OOB) Log-loss: 0.47059335
(OOB) PR-AUC: 0.91289228
(OOB) G-mean: 0.84443067
(OOB) Requested performance error: 0.15556933
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 659 95 0.1260
0 144 638 0.1841
(OOB) Misclassification rate: 0.155599
plot(rfobj,plots.one.page = FALSE)

all Yes 0
Age 0.0339 NA NA
Ethnicity 0.0271 NA NA
Gender 0.0239 NA NA
EnglishDiff 0.0212 NA NA
Health.Insurance 0.0201 NA NA
Helpful.Family 0.0157 NA NA
Discrimination 0.0150 NA NA
Religion 0.0145 NA NA
Income_median 0.0137 NA NA
EnglishSpeak 0.0132 NA NA
Community.Shares.Values 0.0131 NA NA
Helpful.Community 0.0123 NA NA
Close.Family 0.0119 NA NA
Religious.Importance 0.0112 NA NA
Close.Friends 0.0104 NA NA
See.Family 0.0104 NA NA
Spend.Time.Together 0.0098 NA NA
Religious.Attendance 0.0085 NA NA
Helpful.Friends 0.0066 NA NA
Employment 0.0065 NA NA
Expression 0.0059 NA NA
Successful.Family 0.0059 NA NA
Community.Trust 0.0052 NA NA
Trust 0.0052 NA NA
Close.knit.Community 0.0047 NA NA
See.Friends 0.0040 NA NA
rfobj$importance all Yes 0
Ethnicity 0.0271351212 NA NA
Age 0.0339397455 NA NA
Gender 0.0239012227 NA NA
Religion 0.0144600197 NA NA
Employment 0.0065346238 NA NA
Income_median 0.0137359406 NA NA
EnglishSpeak 0.0132246656 NA NA
EnglishDiff 0.0211588431 NA NA
See.Family 0.0103506437 NA NA
Close.Family 0.0118872456 NA NA
Helpful.Family 0.0157320061 NA NA
See.Friends 0.0039800567 NA NA
Close.Friends 0.0104197808 NA NA
Helpful.Friends 0.0065852278 NA NA
Family.Respect 0.0006851545 NA NA
Similar.Values 0.0018920576 NA NA
Successful.Family 0.0058507574 NA NA
Trust 0.0051714898 NA NA
Loyalty 0.0019645267 NA NA
Family.Pride -0.0012563461 NA NA
Expression 0.0059205347 NA NA
Spend.Time.Together 0.0098028332 NA NA
Feel.Close 0.0039296096 NA NA
Togetherness 0.0019433893 NA NA
Religious.Attendance 0.0084556719 NA NA
Religious.Importance 0.0111877913 NA NA
Close.knit.Community 0.0047058219 NA NA
Helpful.Community 0.0123436839 NA NA
Community.Shares.Values 0.0130716504 NA NA
Get.Along 0.0038690770 NA NA
Community.Trust 0.0051896773 NA NA
Health.Insurance 0.0200749381 NA NA
Dental.Insurance 0.0005655640 NA NA
Discrimination 0.0150385112 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
ggsave(filename="PChk_train_VIMP.png",width=5,height=5,units="in")Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
260.0000000 122.0000000 2.1311475 0.3193717 0.6065574 0.7000000
prec npv misclass brier brier.norm auc
0.4868421 0.7913043 0.3298429 0.1947935 0.7791741 0.6841110
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.5772494 0.5401460 0.6254876 0.3193717 0.5048314 0.5958759
F1modgmean gmean
0.6385467 0.6516058
test_rf$importance all Yes 0
Ethnicity -2.208910e-03 NA NA
Age 1.268450e-02 NA NA
Gender -9.681833e-04 NA NA
Religion 1.794378e-03 NA NA
Employment -2.685932e-04 NA NA
Income_median 4.498425e-03 NA NA
EnglishSpeak 1.726043e-03 NA NA
EnglishDiff 1.334770e-03 NA NA
See.Family -2.002236e-03 NA NA
Close.Family -5.934351e-04 NA NA
Helpful.Family 2.611360e-03 NA NA
See.Friends -7.542945e-04 NA NA
Close.Friends -1.573132e-03 NA NA
Helpful.Friends 6.476850e-04 NA NA
Family.Respect -5.441438e-04 NA NA
Similar.Values 3.513048e-04 NA NA
Successful.Family 2.577504e-04 NA NA
Trust 1.661835e-04 NA NA
Loyalty 3.849821e-04 NA NA
Family.Pride -5.585242e-04 NA NA
Expression 4.167576e-05 NA NA
Spend.Time.Together 6.177973e-06 NA NA
Feel.Close -8.730384e-04 NA NA
Togetherness -7.008496e-04 NA NA
Religious.Attendance -1.812429e-03 NA NA
Religious.Importance -1.024334e-03 NA NA
Close.knit.Community 7.397242e-05 NA NA
Helpful.Community -4.507673e-04 NA NA
Community.Shares.Values 1.940180e-04 NA NA
Get.Along -9.124803e-05 NA NA
Community.Trust 8.462331e-05 NA NA
Health.Insurance 2.294108e-02 NA NA
Dental.Insurance 3.090500e-02 NA NA
Discrimination -1.006681e-03 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
ggsave(filename="PChk_test_VIMP.png",width=5,height=5,units="in")Dental Checkup
ps(`Dentist Check-up`)# A tibble: 3 × 3
`Dentist Check-up` n pct
<fct> <int> <dbl>
1 0 1100 42.2
2 Yes 1462 56.0
3 <NA> 47 1.80
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Dentist Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Dentist.Check.up ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="auc")
print(imb) Sample size: 1915
Frequency of class labels: 786, 1129
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 472.9973
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 1210
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.4364
(OOB) Brier score: 0.19312832
(OOB) Normalized Brier score: 0.77251328
(OOB) AUC: 0.76845319
(OOB) Log-loss: 0.5707148
(OOB) PR-AUC: 0.66442725
(OOB) G-mean: 0.69989128
(OOB) Requested performance error: 0.30010872
Confusion matrix:
predicted
observed 0 Yes class.error
0 566 220 0.2799
Yes 361 768 0.3198
(OOB) Misclassification rate: 0.3033943
plot(imb,plots.one.page = F)

all 0 Yes
Dental.Insurance 0.0207 NA NA
Health.Insurance 0.0023 NA NA
Income_median 0.0014 NA NA
Helpful.Community 0.0012 NA NA
EnglishDiff 0.0006 NA NA
Community.Trust 0.0002 NA NA
Spend.Time.Together -0.0006 NA NA
EnglishSpeak -0.0006 NA NA
Helpful.Friends -0.0011 NA NA
Employment -0.0012 NA NA
Family.Respect -0.0014 NA NA
Feel.Close -0.0015 NA NA
Togetherness -0.0016 NA NA
Loyalty -0.0017 NA NA
Family.Pride -0.0017 NA NA
Helpful.Family -0.0017 NA NA
Religion -0.0020 NA NA
Community.Shares.Values -0.0021 NA NA
Trust -0.0023 NA NA
Close.Friends -0.0025 NA NA
Discrimination -0.0026 NA NA
Close.knit.Community -0.0026 NA NA
See.Family -0.0028 NA NA
Expression -0.0031 NA NA
Successful.Family -0.0040 NA NA
Get.Along -0.0042 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1129.0000000 786.0000000 1.4363868 0.4104439 0.7201018 0.6802480
prec npv misclass brier brier.norm auc
0.6105717 0.7773279 0.3033943 0.1931283 0.7725133 0.7684532
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.5707148 0.6608290 0.6916811 0.4104439 0.6644272 0.6803601
F1modgmean gmean
0.6957862 0.6998913
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Dentist.Check.up,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Dentist.Check.up~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Dentist.Check.up~ .,importance=T,data=train,
perf.type = "gmean",splitrule="auc")
print(rfobj) Sample size: 1533
Frequency of class labels: 754, 779
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 299.2413
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 969
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0332
(OOB) Brier score: 0.15203825
(OOB) Normalized Brier score: 0.60815301
(OOB) AUC: 0.897689
(OOB) Log-loss: 0.47986324
(OOB) PR-AUC: 0.89419853
(OOB) G-mean: 0.81348703
(OOB) Requested performance error: 0.18651297
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 616 138 0.183
0 148 631 0.190
(OOB) Misclassification rate: 0.1865623
print(rfobj) Sample size: 1533
Frequency of class labels: 754, 779
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 299.2413
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 969
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0332
(OOB) Brier score: 0.15203825
(OOB) Normalized Brier score: 0.60815301
(OOB) AUC: 0.897689
(OOB) Log-loss: 0.47986324
(OOB) PR-AUC: 0.89419853
(OOB) G-mean: 0.81348703
(OOB) Requested performance error: 0.18651297
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 616 138 0.183
0 148 631 0.190
(OOB) Misclassification rate: 0.1865623
plot(rfobj,plots.one.page = FALSE)

all Yes 0
Ethnicity 0.0382 NA NA
Religion 0.0295 NA NA
Dental.Insurance 0.0195 NA NA
Religious.Importance 0.0189 NA NA
EnglishDiff 0.0182 NA NA
EnglishSpeak 0.0156 NA NA
Gender 0.0156 NA NA
Helpful.Friends 0.0137 NA NA
Religious.Attendance 0.0136 NA NA
See.Friends 0.0130 NA NA
Age 0.0123 NA NA
Close.Friends 0.0123 NA NA
See.Family 0.0091 NA NA
Helpful.Community 0.0084 NA NA
Close.knit.Community 0.0084 NA NA
Successful.Family 0.0084 NA NA
Expression 0.0084 NA NA
Get.Along 0.0078 NA NA
Similar.Values 0.0078 NA NA
Helpful.Family 0.0071 NA NA
Discrimination 0.0065 NA NA
Spend.Time.Together 0.0065 NA NA
Close.Family 0.0065 NA NA
Community.Trust 0.0058 NA NA
Community.Shares.Values 0.0052 NA NA
Togetherness 0.0052 NA NA
rfobj$importance all Yes 0
Ethnicity 3.819788e-02 NA NA
Age 1.234635e-02 NA NA
Gender 1.559907e-02 NA NA
Religion 2.945558e-02 NA NA
Employment 5.166942e-03 NA NA
Income_median 2.563610e-03 NA NA
EnglishSpeak 1.559907e-02 NA NA
EnglishDiff 1.823009e-02 NA NA
See.Family 9.096205e-03 NA NA
Close.Family 6.467539e-03 NA NA
Helpful.Family 7.117568e-03 NA NA
See.Friends 1.298946e-02 NA NA
Close.Friends 1.234104e-02 NA NA
Helpful.Friends 1.367991e-02 NA NA
Family.Respect 4.534265e-03 NA NA
Similar.Values 7.791365e-03 NA NA
Successful.Family 8.423430e-03 NA NA
Trust 4.524794e-03 NA NA
Loyalty 1.304900e-03 NA NA
Family.Pride -2.720675e-05 NA NA
Expression 8.422373e-03 NA NA
Spend.Time.Together 6.471758e-03 NA NA
Feel.Close 4.534265e-03 NA NA
Togetherness 5.173261e-03 NA NA
Religious.Attendance 1.364585e-02 NA NA
Religious.Importance 1.886188e-02 NA NA
Close.knit.Community 8.423430e-03 NA NA
Helpful.Community 8.426602e-03 NA NA
Community.Shares.Values 5.204855e-03 NA NA
Get.Along 7.801931e-03 NA NA
Community.Trust 5.816979e-03 NA NA
Health.Insurance 3.354449e-03 NA NA
Dental.Insurance 1.954565e-02 NA NA
Discrimination 6.471758e-03 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
ggsave(filename="DChk_train_VIMP.png",width=5,height=5,units="in")Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
225.0000000 157.0000000 1.4331210 0.4109948 0.8025478 0.7022222
prec npv misclass brier brier.norm auc
0.6528497 0.8359788 0.2565445 0.1879495 0.7517980 0.7904600
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.5594116 0.7200000 0.7410109 0.4109948 0.6829062 0.7353555
F1modgmean gmean
0.7458609 0.7507109
test_rf$importance all Yes 0
Ethnicity 8.508110e-03 NA NA
Age 8.271317e-03 NA NA
Gender 1.358924e-03 NA NA
Religion 6.568436e-03 NA NA
Employment -4.547879e-04 NA NA
Income_median 4.703719e-03 NA NA
EnglishSpeak 8.740063e-03 NA NA
EnglishDiff 5.355667e-03 NA NA
See.Family 3.062052e-03 NA NA
Close.Family 2.657587e-03 NA NA
Helpful.Family 1.579786e-03 NA NA
See.Friends 3.530472e-03 NA NA
Close.Friends 1.913067e-03 NA NA
Helpful.Friends 2.426828e-03 NA NA
Family.Respect 4.885193e-04 NA NA
Similar.Values -1.385758e-05 NA NA
Successful.Family 7.502447e-04 NA NA
Trust 7.173918e-04 NA NA
Loyalty 1.029924e-03 NA NA
Family.Pride 8.650342e-04 NA NA
Expression -3.067024e-04 NA NA
Spend.Time.Together 2.456691e-03 NA NA
Feel.Close 3.199467e-04 NA NA
Togetherness 2.767349e-04 NA NA
Religious.Attendance 6.427582e-04 NA NA
Religious.Importance 9.328929e-04 NA NA
Close.knit.Community -1.041299e-03 NA NA
Helpful.Community 5.545616e-04 NA NA
Community.Shares.Values -2.149310e-04 NA NA
Get.Along 7.956774e-04 NA NA
Community.Trust -4.289379e-05 NA NA
Health.Insurance 6.140608e-03 NA NA
Dental.Insurance 6.922811e-02 NA NA
Discrimination 3.693327e-03 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
ggsave(filename="DChk_test_VIMP.png",width=5,height=5,units="in")Urgent Care
ps(`Urgentcare`)# A tibble: 3 × 3
Urgentcare n pct
<fct> <int> <dbl>
1 0 2112 81.0
2 Yes 440 16.9
3 <NA> 57 2.18
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Urgentcare`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(`Urgentcare` ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="auc")
print(imb) Sample size: 1908
Frequency of class labels: 1594, 314
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 357.6643
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 1206
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 5.0764
(OOB) Brier score: 0.13517241
(OOB) Normalized Brier score: 0.54068965
(OOB) AUC: 0.59929952
(OOB) Log-loss: 0.43902022
(OOB) PR-AUC: 0.23059263
(OOB) G-mean: 0.56307797
(OOB) Requested performance error: 0.43692203
Confusion matrix:
predicted
observed 0 Yes class.error
0 818 776 0.4868
Yes 120 194 0.3822
(OOB) Misclassification rate: 0.4696017
plot(imb,plots.one.page = F)

all 0 Yes
Age 0.0126 NA NA
Family.Pride 0.0055 NA NA
Discrimination 0.0055 NA NA
Spend.Time.Together 0.0046 NA NA
Health.Insurance 0.0046 NA NA
Helpful.Family 0.0039 NA NA
Close.Family 0.0037 NA NA
Trust 0.0022 NA NA
Similar.Values 0.0022 NA NA
Dental.Insurance 0.0008 NA NA
Employment 0.0003 NA NA
Togetherness 0.0000 NA NA
Income_median 0.0000 NA NA
Helpful.Community -0.0005 NA NA
See.Friends -0.0015 NA NA
Loyalty -0.0021 NA NA
Close.knit.Community -0.0021 NA NA
Expression -0.0025 NA NA
Community.Shares.Values -0.0033 NA NA
Feel.Close -0.0036 NA NA
Family.Respect -0.0040 NA NA
Helpful.Friends -0.0060 NA NA
Successful.Family -0.0068 NA NA
Ethnicity -0.0068 NA NA
EnglishDiff -0.0071 NA NA
Gender -0.0072 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1594.0000000 314.0000000 5.0764331 0.1645702 0.6178344 0.5131744
prec npv misclass brier brier.norm auc
0.2000000 0.8720682 0.4696017 0.1351724 0.5406897 0.5992995
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.4390202 0.3021807 0.4117806 0.1645702 0.2305926 0.4326293
F1modgmean gmean
0.4874293 0.5630780
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Urgentcare,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Urgentcare~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Urgentcare~ .,importance=T,data=train,
perf.type = "gmean",splitrule="auc")
print(rfobj) Sample size: 1528
Frequency of class labels: 751, 777
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 305.359
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 966
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0346
(OOB) Brier score: 0.15050494
(OOB) Normalized Brier score: 0.60201977
(OOB) AUC: 0.93951437
(OOB) Log-loss: 0.48064091
(OOB) PR-AUC: 0.92901033
(OOB) G-mean: 0.86360265
(OOB) Requested performance error: 0.13639735
Confusion matrix:
predicted
observed 0 Yes class.error
0 640 111 0.1478
Yes 97 680 0.1248
(OOB) Misclassification rate: 0.1361257
print(rfobj) Sample size: 1528
Frequency of class labels: 751, 777
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 305.359
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 966
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0346
(OOB) Brier score: 0.15050494
(OOB) Normalized Brier score: 0.60201977
(OOB) AUC: 0.93951437
(OOB) Log-loss: 0.48064091
(OOB) PR-AUC: 0.92901033
(OOB) G-mean: 0.86360265
(OOB) Requested performance error: 0.13639735
Confusion matrix:
predicted
observed 0 Yes class.error
0 640 111 0.1478
Yes 97 680 0.1248
(OOB) Misclassification rate: 0.1361257
plot(rfobj,plots.one.page = FALSE)

all 0 Yes
Ethnicity 0.0292 NA NA
Religion 0.0258 NA NA
Community.Shares.Values 0.0232 NA NA
EnglishSpeak 0.0219 NA NA
Dental.Insurance 0.0206 NA NA
Close.Friends 0.0181 NA NA
Religious.Attendance 0.0180 NA NA
Income_median 0.0148 NA NA
EnglishDiff 0.0147 NA NA
Helpful.Community 0.0135 NA NA
Get.Along 0.0135 NA NA
Helpful.Family 0.0132 NA NA
Close.knit.Community 0.0127 NA NA
See.Friends 0.0123 NA NA
Community.Trust 0.0116 NA NA
Religious.Importance 0.0114 NA NA
Close.Family 0.0109 NA NA
Helpful.Friends 0.0109 NA NA
Togetherness 0.0104 NA NA
Employment 0.0089 NA NA
Family.Pride 0.0084 NA NA
Age 0.0082 NA NA
Discrimination 0.0076 NA NA
See.Family 0.0064 NA NA
Expression 0.0063 NA NA
Similar.Values 0.0063 NA NA
rfobj$importance all 0 Yes
Ethnicity 0.0291504093 NA NA
Age 0.0081546189 NA NA
Gender 0.0057340537 NA NA
Religion 0.0257942205 NA NA
Employment 0.0089111982 NA NA
Income_median 0.0148154717 NA NA
EnglishSpeak 0.0219076271 NA NA
EnglishDiff 0.0146781895 NA NA
See.Family 0.0064375131 NA NA
Close.Family 0.0108874751 NA NA
Helpful.Family 0.0132269812 NA NA
See.Friends 0.0122813487 NA NA
Close.Friends 0.0180530851 NA NA
Helpful.Friends 0.0108633588 NA NA
Family.Respect -0.0002767768 NA NA
Similar.Values 0.0063445514 NA NA
Successful.Family 0.0062925772 NA NA
Trust 0.0062486015 NA NA
Loyalty -0.0016068460 NA NA
Family.Pride 0.0084190950 NA NA
Expression 0.0063445514 NA NA
Spend.Time.Together 0.0036739590 NA NA
Feel.Close 0.0024163880 NA NA
Togetherness 0.0104384214 NA NA
Religious.Attendance 0.0179953247 NA NA
Religious.Importance 0.0114493763 NA NA
Close.knit.Community 0.0127434595 NA NA
Helpful.Community 0.0134819482 NA NA
Community.Shares.Values 0.0232494227 NA NA
Get.Along 0.0134597741 NA NA
Community.Trust 0.0116424583 NA NA
Health.Insurance 0.0043946796 NA NA
Dental.Insurance 0.0205984596 NA NA
Discrimination 0.0075578444 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
ggsave(filename="UC_train_VIMP.png",width=5,height=5,units="in")Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T,outcome="test")
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
318.0000000 62.0000000 5.1290323 0.1631579 0.4354839 0.6132075
prec npv misclass brier brier.norm auc
0.1800000 0.8478261 0.4157895 0.1381798 0.5527191 0.5213025
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.4504602 0.2547170 0.3751603 0.1631579 0.1688369 0.3857390
F1modgmean gmean
0.4459607 0.5167611
test_rf$importance all 0 Yes
Ethnicity -4.404010e-04 NA NA
Age 6.294261e-03 NA NA
Gender -1.319354e-03 NA NA
Religion -1.944322e-03 NA NA
Employment 3.060151e-03 NA NA
Income_median 4.626348e-05 NA NA
EnglishSpeak -2.650589e-03 NA NA
EnglishDiff 6.401704e-03 NA NA
See.Family 1.796675e-03 NA NA
Close.Family 1.474619e-03 NA NA
Helpful.Family 2.581160e-03 NA NA
See.Friends -1.812413e-03 NA NA
Close.Friends 2.267354e-03 NA NA
Helpful.Friends 7.554440e-04 NA NA
Family.Respect -9.647341e-04 NA NA
Similar.Values -2.101901e-05 NA NA
Successful.Family 1.055152e-03 NA NA
Trust -8.026600e-04 NA NA
Loyalty -3.004785e-04 NA NA
Family.Pride -1.110644e-03 NA NA
Expression -1.192176e-03 NA NA
Spend.Time.Together 1.265061e-04 NA NA
Feel.Close -5.187257e-04 NA NA
Togetherness 2.747022e-04 NA NA
Religious.Attendance -2.154685e-03 NA NA
Religious.Importance -6.844490e-04 NA NA
Close.knit.Community 1.337990e-03 NA NA
Helpful.Community 5.608645e-03 NA NA
Community.Shares.Values 5.225066e-04 NA NA
Get.Along 1.170703e-03 NA NA
Community.Trust 1.138418e-03 NA NA
Health.Insurance 1.600614e-03 NA NA
Dental.Insurance 6.147797e-03 NA NA
Discrimination -6.239883e-03 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
ggsave(filename="UC_test_VIMP.png",width=5,height=5,units="in")Folk Medicine
ps(`Folkmedicine`)# A tibble: 3 × 3
Folkmedicine n pct
<fct> <int> <dbl>
1 0 2189 83.9
2 Yes 348 13.3
3 <NA> 72 2.76
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Folkmedicine`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`,`Health Insurance`,`Dental Insurance`,`Discrimination`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Folkmedicine ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="auc")
print(imb) Sample size: 1899
Frequency of class labels: 1642, 257
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 306.211
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 1200
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 6.3891
(OOB) Brier score: 0.11175568
(OOB) Normalized Brier score: 0.44702273
(OOB) AUC: 0.67616364
(OOB) Log-loss: 0.37586696
(OOB) PR-AUC: 0.23627522
(OOB) G-mean: 0.62877744
(OOB) Requested performance error: 0.37122256
Confusion matrix:
predicted
observed 0 Yes class.error
0 970 672 0.4093
Yes 85 172 0.3307
(OOB) Misclassification rate: 0.3986309
plot(imb,plots.one.page = F)

all 0 Yes
Age 0.0299 NA NA
Ethnicity 0.0196 NA NA
Helpful.Friends 0.0083 NA NA
EnglishSpeak 0.0072 NA NA
Togetherness 0.0066 NA NA
Feel.Close 0.0062 NA NA
Family.Pride 0.0061 NA NA
Family.Respect 0.0060 NA NA
Religion 0.0060 NA NA
Close.knit.Community 0.0051 NA NA
Community.Trust 0.0050 NA NA
Trust 0.0047 NA NA
Close.Friends 0.0046 NA NA
EnglishDiff 0.0046 NA NA
See.Friends 0.0044 NA NA
Employment 0.0042 NA NA
Religious.Importance 0.0034 NA NA
Dental.Insurance 0.0031 NA NA
Loyalty 0.0031 NA NA
Get.Along 0.0026 NA NA
Health.Insurance 0.0019 NA NA
Similar.Values 0.0019 NA NA
See.Family 0.0018 NA NA
Expression 0.0011 NA NA
Helpful.Community 0.0009 NA NA
Helpful.Family -0.0002 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1642.0000000 257.0000000 6.3891051 0.1353344 0.6692607 0.5907430
prec npv misclass brier brier.norm auc
0.2037915 0.9194313 0.3986309 0.1117557 0.4470227 0.6761636
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.3758670 0.3124432 0.4356551 0.1353344 0.2362752 0.4706103
F1modgmean gmean
0.5322163 0.6287774
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Folkmedicine,p=0.8,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Folkmedicine~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(`Folkmedicine` ~ .,importance=T,data=train,
perf.type = "gmean",splitrule="auc")
print(rfobj) Sample size: 1520
Frequency of class labels: 747, 773
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 289.0813
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 961
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0348
(OOB) Brier score: 0.13602902
(OOB) Normalized Brier score: 0.54411608
(OOB) AUC: 0.94618318
(OOB) Log-loss: 0.44503014
(OOB) PR-AUC: 0.94488817
(OOB) G-mean: 0.86839658
(OOB) Requested performance error: 0.13160342
Confusion matrix:
predicted
observed 0 Yes class.error
0 632 115 0.1539
Yes 84 689 0.1087
(OOB) Misclassification rate: 0.1309211
print(rfobj) Sample size: 1520
Frequency of class labels: 747, 773
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 289.0813
No. of variables tried at each split: 6
Total no. of variables: 34
Resampling used to grow trees: swor
Resample size used to grow trees: 961
Analysis: RFQ
Family: class
Splitting rule: auc *random*
Number of random split points: 10
Imbalanced ratio: 1.0348
(OOB) Brier score: 0.13602902
(OOB) Normalized Brier score: 0.54411608
(OOB) AUC: 0.94618318
(OOB) Log-loss: 0.44503014
(OOB) PR-AUC: 0.94488817
(OOB) G-mean: 0.86839658
(OOB) Requested performance error: 0.13160342
Confusion matrix:
predicted
observed 0 Yes class.error
0 632 115 0.1539
Yes 84 689 0.1087
(OOB) Misclassification rate: 0.1309211
plot(rfobj,plots.one.page = FALSE)

all 0 Yes
Ethnicity 0.0295 NA NA
EnglishSpeak 0.0153 NA NA
Religion 0.0153 NA NA
Religious.Importance 0.0135 NA NA
Discrimination 0.0111 NA NA
Community.Shares.Values 0.0103 NA NA
EnglishDiff 0.0103 NA NA
Close.knit.Community 0.0091 NA NA
Religious.Attendance 0.0089 NA NA
Age 0.0086 NA NA
Helpful.Community 0.0076 NA NA
Helpful.Friends 0.0075 NA NA
Dental.Insurance 0.0073 NA NA
Close.Friends 0.0069 NA NA
See.Friends 0.0064 NA NA
Family.Pride 0.0061 NA NA
Community.Trust 0.0058 NA NA
Gender 0.0053 NA NA
Close.Family 0.0052 NA NA
Get.Along 0.0050 NA NA
Spend.Time.Together 0.0046 NA NA
Helpful.Family 0.0044 NA NA
Loyalty 0.0041 NA NA
Trust 0.0041 NA NA
Feel.Close 0.0040 NA NA
Family.Respect 0.0034 NA NA
rfobj$importance all 0 Yes
Ethnicity 2.948927e-02 NA NA
Age 8.644268e-03 NA NA
Gender 5.327054e-03 NA NA
Religion 1.525852e-02 NA NA
Employment 2.472414e-03 NA NA
Income_median 5.755348e-04 NA NA
EnglishSpeak 1.525852e-02 NA NA
EnglishDiff 1.032383e-02 NA NA
See.Family -5.583749e-05 NA NA
Close.Family 5.212687e-03 NA NA
Helpful.Family 4.373457e-03 NA NA
See.Friends 6.423337e-03 NA NA
Close.Friends 6.897620e-03 NA NA
Helpful.Friends 7.523027e-03 NA NA
Family.Respect 3.380869e-03 NA NA
Similar.Values 2.634425e-03 NA NA
Successful.Family 2.472414e-03 NA NA
Trust 4.069851e-03 NA NA
Loyalty 4.069851e-03 NA NA
Family.Pride 6.140097e-03 NA NA
Expression 2.634425e-03 NA NA
Spend.Time.Together 4.639075e-03 NA NA
Feel.Close 4.009743e-03 NA NA
Togetherness -7.415481e-04 NA NA
Religious.Attendance 8.867885e-03 NA NA
Religious.Importance 1.349836e-02 NA NA
Close.knit.Community 9.057301e-03 NA NA
Helpful.Community 7.595451e-03 NA NA
Community.Shares.Values 1.032383e-02 NA NA
Get.Along 5.008068e-03 NA NA
Community.Trust 5.790696e-03 NA NA
Health.Insurance 1.176693e-04 NA NA
Dental.Insurance 7.273614e-03 NA NA
Discrimination 1.111232e-02 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
ggsave(filename="Folk_train_VIMP.png",width=5,height=5,units="in")Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance = T,outcome="test")
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
328.0000000 51.0000000 6.4313725 0.1345646 0.6078431 0.5762195
prec npv misclass brier brier.norm auc
0.1823529 0.9043062 0.4195251 0.1152065 0.4608261 0.6171090
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.3875304 0.2805430 0.4011915 0.1345646 0.1712250 0.4361816
F1modgmean gmean
0.4965058 0.5918201
test_rf$importance all 0 Yes
Ethnicity 4.808677e-03 NA NA
Age 2.162911e-02 NA NA
Gender -1.226642e-03 NA NA
Religion -2.983475e-05 NA NA
Employment 2.670185e-03 NA NA
Income_median 5.172185e-04 NA NA
EnglishSpeak 9.061304e-03 NA NA
EnglishDiff 3.219174e-03 NA NA
See.Family -3.480972e-04 NA NA
Close.Family -6.965218e-04 NA NA
Helpful.Family -5.081808e-03 NA NA
See.Friends -3.446965e-03 NA NA
Close.Friends 3.575321e-03 NA NA
Helpful.Friends -1.391379e-03 NA NA
Family.Respect 3.186908e-04 NA NA
Similar.Values 9.355895e-04 NA NA
Successful.Family 5.843771e-04 NA NA
Trust -6.103914e-04 NA NA
Loyalty -9.823498e-04 NA NA
Family.Pride -3.188944e-03 NA NA
Expression -3.027179e-04 NA NA
Spend.Time.Together -2.377881e-03 NA NA
Feel.Close -3.196601e-03 NA NA
Togetherness 1.070865e-04 NA NA
Religious.Attendance -3.521653e-03 NA NA
Religious.Importance -1.907589e-03 NA NA
Close.knit.Community -1.757698e-03 NA NA
Helpful.Community -1.920971e-03 NA NA
Community.Shares.Values -4.778563e-03 NA NA
Get.Along -2.845168e-03 NA NA
Community.Trust -2.973031e-03 NA NA
Health.Insurance -8.981269e-04 NA NA
Dental.Insurance -1.803215e-03 NA NA
Discrimination 4.059630e-03 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance*100)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "black") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
ggsave(filename="Folk_test_VIMP.png",width=5,height=5,units="in")